home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
C/C++ Users Group Library 1996 July
/
C-C++ Users Group Library July 1996.iso
/
vol_300
/
333_02
/
regex1.c
< prev
next >
Wrap
C/C++ Source or Header
|
1989-04-21
|
19KB
|
719 lines
/* Extended regular expression matching and search.
Copyright (C) 1985 Free Software Foundation, Inc.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "awk.h"
/* To test, compile with -Dtest. This Dtestable feature turns this into */
/* a self-contained program which reads a pattern, describes how it */
/* compiles, then reads a string and searches for it. */
STATIC VOID NEAR PASCAL init_syntax_once(void);
STATIC VOID NEAR PASCAL store_jump(char *from, char opcode, char *to);
STATIC VOID NEAR PASCAL insert_jump(char op, char *from,
char *to, char *current_end);
/* JF this var has taken on whole new meanings as time goes by. Various */
/* bits in this int tell how certain pieces of syntax should work */
static int obscure_syntax = 0;
/* Define the syntax stuff, so we can do the \<...\> things. */
STATIC VOID NEAR PASCAL init_syntax_once(void)
{
register int c;
static BOOL done = FALSE;
if (!done)
{
memset(re_syntax_table, 0, sizeof(re_syntax_table));
for (c = 'a'; c <= 'z'; c++)
re_syntax_table[c] = Sword;
for (c = 'A'; c <= 'Z'; c++)
re_syntax_table[c] = Sword;
for (c = '0'; c <= '9'; c++)
re_syntax_table[c] = Sword;
done = TRUE;
}
return;
}
/* compile_pattern takes a regular-expression descriptor string in the */
/* user's format and converts it into a buffer full of byte commands for */
/* matching. */
/* */
/* pattern is the address of the pattern string */
/* size is the length of it. */
/* bufp is a struct re_pattern_buffer * which points to the info */
/* on where to store the byte commands. */
/* This structure contains a char * which points to the */
/* actual space, which should have been obtained with malloc. */
/* compile_pattern may use realloc to grow the buffer space. */
/* */
/* The number of bytes of commands can be found out by looking in the */
/* struct re_pattern_buffer that bufp pointed to, after compile_pattern */
/* returns. */
#define PATPUSH(ch) (*b++ = (char) (ch))
#define PATFETCH(c) \
{if (p == pend) goto end_of_pattern; \
c = * (unsigned char *) p++; }
#define PATFETCH_RAW(c) \
{if (p == pend) goto end_of_pattern; \
c = * (unsigned char *) p++; }
#define PATUNFETCH p--
#define EXTEND_BUFFER \
{ char *old_buffer = bufp->buffer; \
if (bufp->allocated == (1<<16)) goto too_big; \
bufp->allocated *= 2; \
if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
goto memory_exhausted; \
c = bufp->buffer - old_buffer; \
b += c; \
if (fixup_jump) fixup_jump += c; \
if (laststart) laststart += c; \
begalt += c; \
if (pending_exact) pending_exact += c; \
}
/* JF this function is used to compile UN*X style regexps. In particular, */
/* ( ) and | don't have to be \ed to have a special meaning */
int PASCAL re_set_syntax(int syntax)
{
auto int ret;
ret = obscure_syntax;
obscure_syntax = syntax;
return(ret);
}
char * PASCAL re_compile_pattern(char *pattern, int size, REPAT_BUFFER *bufp)
{
register char *b = bufp->buffer;
register char *p = pattern;
auto char *pend = pattern + size;
register unsigned c, c1;
auto char *p1;
/* address of the count-byte of the most recently inserted "exactn" */
/* command. This makes it possible to tell whether a new exact-match */
/* character can be added to that command or requires a new "exactn" */
/* command. */
auto char *pending_exact = 0;
/* address of the place where a forward-jump should go to the end of the */
/* containing expression. Each alternative of an "or", except the last, */
/* ends with a forward-jump of this sort. */
auto char *fixup_jump = 0;
/* address of start of the most recently finished expression. This tells */
/* postfix * where to find the start of its operand. */
auto char *laststart = 0;
/* In processing a repeat, TRUE means zero matches is allowed */
auto BOOL zero_times_ok;
/* In processing a repeat, TRUE means many matches is allowed */
auto BOOL many_times_ok;
/* address of beginning of regexp, or inside of last \( */
auto char *begalt = b;
/* Stack of information saved by \( and restored by \). Four stack */
/* elements are pushed by each \(: First, the value of b. Second, the */
/* value of fixup_jump. Third, the value of regnum. Fourth, the value of*/
/* begalt. */
auto int stackb[40];
auto int *stackp = stackb;
auto int *stacke = stackb + 40;
auto int *stackt;
/* Counts \('s as they are encountered. Remembered for the matching \), */
/* where it becomes the "register number" to put in the stop_memory */
/* command */
auto int regnum = 1;
bufp->fastmap_accurate = 0;
init_syntax_once();
if (bufp->allocated == 0)
{
bufp->allocated = 28;
if (bufp->buffer)
/* EXTEND_BUFFER loses when bufp->allocated is 0 */
bufp->buffer = (char *) realloc(bufp->buffer, 28);
else
/* Caller did not allocate a buffer. Do it for him */
bufp->buffer = (char *) malloc(28);
if (!bufp->buffer)
goto memory_exhausted;
begalt = b = bufp->buffer;
}
while (p != pend)
{
if (b - bufp->buffer > bufp->allocated - 10)
EXTEND_BUFFER; /* Note that EXTEND_BUFFER clobbers c */
PATFETCH(c);
switch (c)
{
case '$':
/* $ means succeed if at end of line, but only in special */
/* contexts. If randomly in the middle of a pattern, it is */
/* a normal character. */
if (p == pend || (*p == '\\' && (p[1] == ')' || p[1] == '|')))
{
PATPUSH(RECODE_ENDLINE);
break;
}
goto normal_char;
case '^':
/* ^ means succeed if at beg of line, but only if no */
/* preceding pattern. */
if (laststart)
goto normal_char;
PATPUSH(RECODE_BEGLINE);
break;
case '*':
case '+':
case '?':
/* If there is no previous pattern, char not special. */
if (!laststart)
goto normal_char;
/* If there is a sequence of repetition chars, collapse it
* down to equivalent to just one. */
zero_times_ok = FALSE;
many_times_ok = FALSE;
while (TRUE)
{
zero_times_ok |= (c != '+');
many_times_ok |= (c != '?');
if (p == pend)
break;
PATFETCH(c);
if (!(c == '*' || c == '+' || c == '?'))
{
PATUNFETCH;
break;
}
}
/* Now we know whether 0 matches is allowed, and whether 2 or
* more matches is allowed. */
if (many_times_ok)
{
/* If more than one repetition is allowed, put in a
* backward jump at the end. */
store_jump(b, RECODE_MAYBE_FINALIZE_JUMP, laststart - 3);
b += 3;
}
insert_jump(RECODE_ON_FAILURE_JUMP, laststart, b + 3, b);
pending_exact = 0;
b += 3;
if (!zero_times_ok)
{
/* At least one repetition required: insert before the
* loop a skip over the initial on-failure-jump
* instruction */
insert_jump(RECODE_DUMMY_FAILURE_JUMP, laststart,
laststart + 6, b);
b += 3;
}
break;
case '.':
laststart = b;
PATPUSH(RECODE_ANYCHAR);
break;
case '[':
if (b - bufp->buffer
> bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
/* Note that EXTEND_BUFFER clobbers c */
EXTEND_BUFFER;
laststart = b;
if (*p == '^')
{
PATPUSH(RECODE_CHARSET_NOT);
++p;
}
else
PATPUSH(RECODE_CHARSET);
p1 = p;
PATPUSH((1 << BYTEWIDTH) / BYTEWIDTH);